# Source script from top-level project folder
source("../process_plate_run.R")

#— 1. Resolve output locations from params, falling back to defaults
plot_dir   <- params$plot_dir   %||% "output_prot/plots"
data_dir   <- params$data_dir   %||% "output_prot/export_data"
report_dir <- params$report_dir %||% "output_prot/reports"

#— 2. Make sure every output directory exists (recursive, create-if-missing)
for (dir in c(plot = plot_dir, data = data_dir, reports = report_dir)) {
  if (!dir_exists(dir)) {
    dir_create(dir, recurse = TRUE)
  }
}

#— 3. Point knitr's figure output at plot_dir
opts_chunk$set(fig.path = paste0(plot_dir, "/"))
#
# Load every Excel workbook in the input folder into the global environment,
# one data frame per file, named after the file (minus path and extension).
input_folder <- "Input_prot"

# Collect candidate workbooks (.xls / .xlsx), then drop Office lock files,
# whose basenames start with "~$".
excel_files <- list.files(path = input_folder, pattern = "\\.xlsx?$", full.names = TRUE)
excel_files <- excel_files[!startsWith(basename(excel_files), "~$")]

for (file in excel_files) {
  # Clean object name: strip folder and extension
  name <- tools::file_path_sans_ext(basename(file))

  # Prefer the second sheet when one exists, otherwise fall back to the
  # first (raw plate exports often keep metadata on sheet 1).
  sheet_names <- excel_sheets(file)
  sheet_to_read <- if (length(sheet_names) >= 2) sheet_names[2] else sheet_names[1]

  # Read quietly and publish the data frame to the global environment
  data <- suppressMessages(read_excel(file, sheet = sheet_to_read))
  assign(name, data, envir = .GlobalEnv)

  # Confirm the load in the console
  message("Loaded: ", name, " (sheet = '", sheet_to_read, "')")
}
# 1. Define which wells were used as blanks in each run
blanks1 <- c("A01", "A02", "A03")
blanks2 <- c("H11", "H12")
#
# 2. Build lists of raw datasets and their corresponding blank vectors.
# NOTE(review): listprot is named but listBlanks is NOT — the pairing is
# positional (pr1 <-> blanks1, pr2 <-> blanks2), so the two lists must be
# kept in the same order.
listprot <- list(pr1 = pr1, pr2 = pr2)
#
listBlanks <- list(blanks1, blanks2)
#
# 3. Run the wrapper: it calls tidy_and_correct() on each dataset.
# Presumably this subtracts each run's blank wells and produces the
# pr1_tidy / pr2_tidy data frames used below — confirm against tidy_all().
tidy_all(listprot, listBlanks)
### Description of the `joindf_by_id` function

The `joindf_by_id` function takes two data frames (`df1` and `df2`) and merges
them by matching unique sample identifiers, using either a `Cell_ID` column or
a `plate well` column.

Key steps and features:

- **Column cleaning:** trims whitespace from column names in both data frames
  to avoid join errors caused by accidental spaces.
- **Key column verification:** checks that at least one data frame contains a
  `Cell_ID` column and the other contains a `plate well` column — these serve
  as the join keys.
- **Role assignment:** depending on which data frame contains `Cell_ID`, that
  data frame is assigned as the base (`df_cell`), and the other becomes the
  joining data (`df_plate`).
- **Rename join keys:** renames both join columns to a common key name
  (`join_id`) to facilitate a straightforward left join.
- **Perform join:** conducts a left join, keeping all rows from the base data
  frame and adding matching data from the other.
- **Identify unmatched rows:** any rows in the larger data frame without
  matches are saved separately for troubleshooting.
- **Output files:** saves the merged data frame as a CSV named according to
  the provided `output_name`, and writes unmatched rows into a separate CSV
  file.
- **Global environment assignment:** assigns the merged data frame into the
  global R environment under the same name as the output file (minus the
  `.csv` extension).
- **Reporting:** prints messages listing any unmatched identifiers and returns
  a summary report containing counts of matched/unmatched rows and the file
  paths of saved CSVs.
# Join each tidied protein run to its weight sheet and save the results.
# 1. Create a subdirectory for the joined weights inside data_dir
joined_weights_dir <- file.path(data_dir, "joined_weights_prot")
if (!dir.exists(joined_weights_dir)) dir.create(joined_weights_dir, recursive = TRUE)

# 2. Pair each tidied run with its weight table; names drive the output files
list_weights <- list(
  pr1_weights,
  pr2_weights
)
pr_list <- list(
  pr1 = pr1_tidy,
  pr2 = pr2_tidy
)
df_names <- names(pr_list)

# 3. One joindf_by_id() call per run.
# Map(f, ...) is exactly mapply(f, ..., SIMPLIFY = FALSE).
Map(
  function(run_df, weight_df, run_name) {
    joined_name <- paste0(run_name, "_weights_joined")
    joindf_by_id(
      df1 = run_df,
      df2 = weight_df,
      key_df1 = "Cell_ID",
      key_df2 = "plate well",
      assign_name = joined_name,
      save_csv_path = file.path(joined_weights_dir, paste0(joined_name, ".csv"))
    )
  },
  pr_list,
  list_weights,
  df_names
)
## $assigned_to
## [1] "pr1_weights_joined"
##
## $rows_total
## [1] 96
##
## $columns_total
## [1] 8
##
## $unmatched_rows
## [1] 0
## $assigned_to
## [1] "pr2_weights_joined"
##
## $rows_total
## [1] 96
##
## $columns_total
## [1] 7
##
## $unmatched_rows
## [1] 0
## $pr1
## # A tibble: 96 × 8
## join_id X600 X650 X750 ID weights `Tray cell`
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr>
## 1 A01 -0.00667 -0.00733 -0.00833 Blank01 0 A1
## 2 A02 -0.000667 -0.00233 -0.00433 Blank02 0 A2
## 3 A03 0.00733 0.00967 0.0127 Blank03 0 A3
## 4 A04 0.136 0.156 0.187 Lab_01 10.7 A7
## 5 A05 0.116 0.133 0.160 Lab_01 9.6 A8
## 6 A06 0.157 0.180 0.216 Lab_01 10.6 A9
## 7 A07 0.0433 0.0527 0.0687 Lab_02 11 A10
## 8 A08 0.0833 0.0917 0.106 Lab_02 10.3 A11
## 9 A09 0.0553 0.0667 0.0777 Lab_02 11 A12
## 10 A10 0.0433 0.0507 0.0617 Lab_03 11 A13
## # ℹ 86 more rows
## # ℹ 1 more variable: date <dttm>
##
## $pr2
## # A tibble: 96 × 7
## join_id X600 ID weights `sample weight` `Tray well` date
## <chr> <dbl> <chr> <dbl> <dbl> <chr> <dttm>
## 1 A01 0.289 Lab_01 26.6 1 a1 2025-05-30 00:00:00
## 2 A02 0.196 Lab_01 27 2 a2 2025-05-30 00:00:00
## 3 A03 0.165 Lab_02 24.1 3 a3 2025-05-30 00:00:00
## 4 A04 0.234 Lab_02 25.9 4 a4 2025-05-30 00:00:00
## 5 A05 0.332 Lab_03 25 5 a5 2025-05-30 00:00:00
## 6 A06 0.316 Lab_03 26 6 a6 2025-05-30 00:00:00
## 7 A07 0.194 Lab_04 25.3 7 a7 2025-05-30 00:00:00
## 8 A08 0.132 Lab_04 23.9 8 a8 2025-05-30 00:00:00
## 9 A09 0.240 Lab_05 23.8 9 a9 2025-05-30 00:00:00
## 10 A10 0.262 Lab_05 25.7 10 a10 2025-05-30 00:00:00
## # ℹ 86 more rows
# Example data: Replace this with your actual data
# You can also use `read.csv("your_data.csv")` if loading from a file

# Fit the standard curve: blank-corrected absorbance at 600 nm (X600_cor)
# as a linear function of known protein concentration (mg/mL).
model <- lm(X600_cor ~ concentration, data = calibration_prot)

# Print the fit summary (slope, intercept, R²) for the report
summary(model)

# Plot the calibration points, the fitted line, and an equation annotation
prot_curve_plot <- ggplot(calibration_prot, aes(x = concentration, y = X600_cor)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(
    title = "Protein Standard Curve (Lowry Method)",
    x = "Protein Concentration (mg/mL)",
    y = "Absorbance (600 nm)"
  ) +
  annotate(
    "text",
    x = max(calibration_prot$concentration) * 0.5,
    y = max(calibration_prot$X600_cor) * 0.9,
    # sprintf with an explicit sign keeps the equation readable even when the
    # intercept is negative; the previous paste0(round(...)) form would have
    # rendered e.g. "y = 0.1553x + -0.0138".
    label = sprintf(
      "y = %.4fx %s %.4f\nR² = %.4f",
      coef(model)[2],
      if (coef(model)[1] < 0) "-" else "+",
      abs(coef(model)[1]),
      summary(model)$r.squared
    ),
    size = 4, hjust = 0
  ) +
  theme_minimal()

# Persist the plot via the project helper, then display it in the report
save_object(prot_curve_plot,
            filename = "protein_standard_curve",
            directory = "plots",
            width = 6,
            height = 5,
            dpi = 300)

print(prot_curve_plot)
##
Calculate protein concentration with standard curve regression
# Use the regression model to predict concentration from absorbance
export_dir <- "output_prot/export_data"
plot_dir <- file.path("output_prot", "plots")
dir.create(export_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(plot_dir, recursive = TRUE, showWarnings = FALSE)
# ---- Step 1: Calibration
# Standard-curve coefficients from the lm fitted above:
# X600_cor = intercept + slope * concentration
intercept <- coef(model)[1]
slope <- coef(model)[2]
# ---- Step 2: Harmonize inputs
# Keep only the columns present in BOTH joined runs so bind_rows() below
# yields a rectangular table (pr1 has 8 columns, pr2 has 7 — see output above).
common_cols <- intersect(names(pr2_weights_joined), names(pr1_weights_joined))
# Coerce weights to numeric and normalize mixed date formats (ymd/dmy/mdy)
# before the runs are combined.
pr1_weights_joined <- pr1_weights_joined %>% mutate(weights = as.numeric(weights), date = parse_date_time(date, orders = c("ymd", "dmy", "mdy")))
pr2_weights_joined <- pr2_weights_joined %>% mutate(weights = as.numeric(weights), date = parse_date_time(date, orders = c("ymd", "dmy", "mdy")))
# ---- Step 3: Combine datasets and calculate concentrations
pr_combined <- bind_rows(pr2_weights_joined[common_cols], pr1_weights_joined[common_cols])
# Invert the calibration line: concentration = (absorbance - intercept) / slope.
# NOTE(review): the calibration was fitted on X600_cor (blank-corrected) but
# prediction uses X600 — confirm the joined tables already carry corrected
# absorbances.
pr_combined$con_mg_per_ml <- (pr_combined$X600 - intercept) / slope
# NOTE(review): 0.5 looks like the assay volume in mL and * 1000 like a
# mg -> g unit conversion on `weights` — confirm both constants against the
# lab protocol.
pr_combined$Protein_mg_total <- pr_combined$con_mg_per_ml * 0.5
pr_combined$Protein_mg_per_g <- (pr_combined$Protein_mg_total / pr_combined$weights) * 1000
# ---- Step 4: Filter and factor runs
# Blank wells have weight 0 and produce Inf above; is.finite() drops them.
# `run` numbers each distinct assay date in order of first appearance.
pr_combined <- pr_combined %>%
  filter(is.finite(Protein_mg_per_g), !is.na(Protein_mg_per_g)) %>%
  mutate(run = as.factor(match(as.Date(date), unique(as.Date(date)))))
# Save combined data
write_csv(pr_combined, file.path(export_dir, "pr_combined.csv"))
# ---- Step 5: Statistical tests
# Parametric and non-parametric comparison of protein content between runs.
# NOTE(review): the formula interface assumes exactly two run levels here.
t_test_result <- t.test(Protein_mg_per_g ~ run, data = pr_combined)
wilcox_result <- wilcox.test(Protein_mg_per_g ~ run, data = pr_combined)
# Save test summaries
capture.output(t_test_result, file = file.path(export_dir, "t_test_run_comparison.txt"))
capture.output(wilcox_result, file = file.path(export_dir, "wilcox_test_run_comparison.txt"))
# ---- Step 6: Plot protein content by run
# Boxplot per run with the individual samples jittered on top; boxplot
# outliers are hidden so points are not drawn twice.
protein_run_plot <- ggplot(pr_combined, aes(x = run, y = Protein_mg_per_g, fill = run)) +
  geom_boxplot(width = 0.6, outlier.shape = NA) +
  geom_jitter(alpha = 0.5, width = 0.1, color = "black") +
  labs(x = "Run", y = "Protein (mg/g)", title = "Protein Content by Run") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5), legend.position = "none")

# Persist via the project helper
save_object(
  object = protein_run_plot,
  filename = "protein_content_by_run",
  directory = "plots",
  subdir = "replicate_analysis/protein"
)
# ---- Step 7: Outlier detection
# Tukey fences (1.5 * IQR) computed from runs 1 and 2, then applied to the
# whole combined table.
df_runs12 <- pr_combined %>% filter(run %in% c("1", "2"))
quartiles <- quantile(df_runs12$Protein_mg_per_g, c(0.25, 0.75), na.rm = TRUE)
iqr_vals <- IQR(df_runs12$Protein_mg_per_g, na.rm = TRUE)
lower_bound <- quartiles[[1]] - 1.5 * iqr_vals
upper_bound <- quartiles[[2]] + 1.5 * iqr_vals

# Flag every row outside the fences
pr_combined <- pr_combined %>%
  mutate(outlier_flag = ifelse(
    Protein_mg_per_g < lower_bound | Protein_mg_per_g > upper_bound,
    "outlier", "normal"
  ))

# Save the flagged table, then a cleaned copy with outliers removed
write_csv(pr_combined, file.path(export_dir, "pr_combined_with_outliers.csv"))
pr_combined_clean <- pr_combined %>% filter(outlier_flag == "normal")
write_csv(pr_combined_clean, file.path(export_dir, "pr_combined_clean.csv"))
# ---- Step 8: Replicate analysis
# Summarise technical replicates per sample ID (the output above shows
# per-variable mean/sd/se/cv and max-deviation columns); writes
# "pr_rep_summary.csv" (read back in Step 9) into export_dir under the
# "pr_rep" prefix.
# NOTE(review): choose_best_3 = TRUE presumably keeps the 3 most consistent
# replicates per ID — confirm against analyze_replicates()'s definition.
analyze_replicates(
  data = pr_combined_clean,
  id_col = "ID",
  join_col = "join_id",
  weight_col = "weights",
  date_col = "date",
  output_prefix = "pr_rep",
  choose_best_3 = TRUE,
  dir = export_dir
)
## # A tibble: 41 × 37
## ID Protein_mg_per_g_mean Protein_mg_total_mean X600_mean
## <fct> <dbl> <dbl> <dbl>
## 1 Lab_01 34.8 0.481 0.163
## 2 Lab_02 18.1 0.282 0.101
## 3 Lab_03 40.1 0.796 0.261
## 4 Lab_04 23.2 0.342 0.120
## 5 Lab_05 28.8 0.252 0.092
## 6 Lab_06 47.3 1.17 0.378
## 7 Lab_07 27.8 0.466 0.158
## 8 Lab_08 24.7 0.210 0.079
## 9 Lab_09 31.6 0.344 0.121
## 10 Lab_10 30.0 0.465 0.158
## # ℹ 31 more rows
## # ℹ 33 more variables: con_mg_per_ml_mean <dbl>, Protein_mg_per_g_sd <dbl>,
## # Protein_mg_total_sd <dbl>, X600_sd <dbl>, con_mg_per_ml_sd <dbl>,
## # Protein_mg_per_g_se <dbl>, Protein_mg_total_se <dbl>, X600_se <dbl>,
## # con_mg_per_ml_se <dbl>, Protein_mg_per_g_cv <dbl>,
## # Protein_mg_total_cv <dbl>, X600_cv <dbl>, con_mg_per_ml_cv <dbl>,
## # Protein_mg_per_g_max_dev_pct <dbl>, Protein_mg_total_max_dev_pct <dbl>, …
# ---- Step 9: Replicate summary and plot
# Read the per-ID replicate summary written by analyze_replicates() above
protein_summary <- read_csv(file.path(export_dir, "pr_rep_summary.csv"))

# Column names are derived from the variable of interest
protein_var <- "Protein_mg_per_g"
mean_col_name <- paste0(protein_var, "_mean")
se_col_name <- paste0(protein_var, "_se")

# File prefix for the plot outputs; make sure its folder exists
output_prefix <- file.path(plot_dir, "replicate_analysis", paste0(protein_var, "_replicate_analysis"))
dir.create(dirname(output_prefix), recursive = TRUE, showWarnings = FALSE)

# Bar/point plot of per-ID means with standard-error bars (project helper)
protein_plot <- graph_replicates_custom_error(
  data = protein_summary,
  id_col = "ID",
  value_col = mean_col_name,
  se_col = se_col_name,
  output_prefix = output_prefix
)

# Persist an interactive copy of the plot
save_object(
  object = protein_plot,
  filename = paste0(protein_var, "_replicate_plot"),
  directory = "plots",
  format = "html",
  subdir = "replicate_analysis/protein"
)

# ---- Step 10: Summary stats
# CV > 0.2 marks a "high variability" sample; the percentage uses mean()
# (which excludes NA CVs from its denominator) rather than count / n().
summary_stats <- protein_summary %>%
  summarise(
    total_rows = n(),
    high_cv_count = sum(Protein_mg_per_g_cv > 0.2, na.rm = TRUE),
    average_cv = mean(Protein_mg_per_g_cv, na.rm = TRUE),
    high_cv_percent = mean(Protein_mg_per_g_cv > 0.2, na.rm = TRUE) * 100
  )

write_csv(summary_stats, file.path(export_dir, "protein_summary_stats.csv"))

# Also print to console (optional)
print(summary_stats)
## # A tibble: 1 × 4
## total_rows high_cv_count average_cv high_cv_percent
## <int> <int> <dbl> <dbl>
## 1 41 13 0.152 31.7
# 1. Merge the replicate summary with the sample metadata by column "ID".
# joindf_by_id() saves "pr_rep_final.csv" and assigns the merged data frame
# to `pr_rep_final` in the global environment.
# 1. Define export folder and create it if needed.
# FIX: use underscored path components ("output_prot", "export_data"),
# consistent with every other output path in this workflow; the original
# used spaces ("output prot", "export data"), silently creating a second,
# parallel output tree.
final_export_dir <- file.path("output_prot", "export_data", "Samples_Analysis_Final")
dir.create(final_export_dir, recursive = TRUE, showWarnings = FALSE)
#
# 2. Join the replicate summary with `Sample data` on the shared ID column
joindf_by_id(
  df1 = protein_summary,
  df2 = `Sample data`,
  save_csv_path = file.path(final_export_dir, "pr_rep_final.csv"),
  assign_name = "pr_rep_final",
  key_df1 = "ID",
  key_df2 = "ID"
)
## $assigned_to
## [1] "pr_rep_final"
##
## $rows_total
## [1] 69
##
## $columns_total
## [1] 44
##
## $unmatched_rows
## [1] 0
## # A tibble: 69 × 44
## join_id Protein_mg_per_g_mean Protein_mg_total_mean X600_mean
## <chr> <dbl> <dbl> <dbl>
## 1 Lab_01 34.8 0.481 0.163
## 2 Lab_02 18.1 0.282 0.101
## 3 Lab_03 40.1 0.796 0.261
## 4 Lab_04 23.2 0.342 0.120
## 5 Lab_05 28.8 0.252 0.092
## 6 Lab_06 47.3 1.17 0.378
## 7 Lab_07 27.8 0.466 0.158
## 8 Lab_08 24.7 0.210 0.079
## 9 Lab_09 31.6 0.344 0.121
## 10 Lab_10 30.0 0.465 0.158
## # ℹ 59 more rows
## # ℹ 40 more variables: con_mg_per_ml_mean <dbl>, Protein_mg_per_g_sd <dbl>,
## # Protein_mg_total_sd <dbl>, X600_sd <dbl>, con_mg_per_ml_sd <dbl>,
## # Protein_mg_per_g_se <dbl>, Protein_mg_total_se <dbl>, X600_se <dbl>,
## # con_mg_per_ml_se <dbl>, Protein_mg_per_g_cv <dbl>,
## # Protein_mg_total_cv <dbl>, X600_cv <dbl>, con_mg_per_ml_cv <dbl>,
## # Protein_mg_per_g_max_dev_pct <dbl>, Protein_mg_total_max_dev_pct <dbl>, …
# 4. Output directory for replicate-analysis plots
rep_plot_dir <- file.path("output_prot", "plots", "replicate_analysis")
dir.create(rep_plot_dir, recursive = TRUE, showWarnings = FALSE)

# 5. Histogram with error bars for the per-Location protein means
graph_replicates_custom_error(
  data = pr_rep_final,
  id_col = "Location",
  value_col = "Protein_mg_per_g_mean",
  se_col = "Protein_mg_per_g_se",
  output_prefix = file.path(rep_plot_dir, "protein_rep_analy_bylocation")
)

# Subsets used by the comparisons below.
# NOTE(review): pr_location_cham is derived from pr_rep_final (not from
# pr_location), so it still contains the market/dried samples — confirm
# that is intentional.
excluded_locations <- c(
  "Lima Market Freeze Dry", "Ilo Freeze Dry", "Ilo oven dry",
  "Ilo Fresh", "Lima Market Fresh"
)
pr_location <- pr_rep_final %>%
  filter(!Location %in% excluded_locations)
pr_location_cham <- pr_rep_final %>%
  filter(!variety %in% c("F.Glom"))

# 6. Group comparisons ----

# By Location (excluding the F.Glom variety)
compare_groups(
  data = pr_location_cham,
  response_var = "Protein_mg_per_g_mean",
  group_var = "Location",
  subfolder_name = "pr_Location_cham"
)

# By life stage (excluding the F.Glom variety)
compare_groups(
  data = pr_location_cham,
  response_var = "Protein_mg_per_g_mean",
  group_var = "Life_S",
  subfolder_name = "pr_Life_S_cham"
)

# By variety, restricted to three field sites
pr_paracas_marcona <- pr_rep_final %>%
  filter(Location %in% c("Mendieta", "7H", "Caro Caido"))
compare_groups(
  data = pr_paracas_marcona,
  response_var = "Protein_mg_per_g_mean",
  group_var = "variety",
  subfolder_name = "pr_variety"
)

# Gametophyte/tetrasporophyte life stages compared across locations
pr_gamtetra <- pr_location %>%
  filter(Life_S %in% c("Gam/Tetra", "Gam", "Tetra"))
compare_groups(
  data = pr_gamtetra,
  response_var = "Protein_mg_per_g_mean",
  group_var = "Location",
  subfolder_name = "pr_gamtetra_location"
)